The first step is to load the required packages.
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Then, import each month's data into its own dataframe.
# Import the twelve monthly Divvy trip extracts into one dataframe each.
# Every file follows the same "<yyyymm>-divvy-tripdata.csv" naming scheme,
# so a small helper keeps the twelve reads consistent instead of repeating
# the full path twelve times. Note: `<-` is the idiomatic R assignment
# operator (the original used `=`).
# show_col_types = FALSE silences the per-file column-specification message.
trip_data_dir <- "~/Google Certification Program/Case Study"

read_month <- function(yyyymm) {
  # Read one month's CSV given its "yyyymm" code, e.g. "202201".
  read_csv(
    file.path(trip_data_dir, paste0(yyyymm, "-divvy-tripdata.csv")),
    show_col_types = FALSE
  )
}

jan_df <- read_month("202201")
feb_df <- read_month("202202")
mar_df <- read_month("202203")
apr_df <- read_month("202204")
may_df <- read_month("202205")
jun_df <- read_month("202206")
jul_df <- read_month("202207")
aug_df <- read_month("202208")
sep_df <- read_month("202209")
oct_df <- read_month("202210")
nov_df <- read_month("202211")
dec_df <- read_month("202212")
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
The next step is to combine all the monthly data into a single yearly dataset, so that we can analyze the whole year instead of just a single month.
To do so, we must first check that the column names and datatypes are consistent across all the dataframes, to prevent issues from occurring during the merging process.
# Confirm the twelve monthly dataframes are row-bind compatible: identical
# column names and column types across all of them. Returns TRUE when they
# can be safely stacked with bind_rows().
compare_df_cols_same(
  jan_df, feb_df, mar_df, apr_df,
  may_df, jun_df, jul_df, aug_df,
  sep_df, oct_df, nov_df, dec_df,
  bind_method = "bind_rows",
  verbose = TRUE
)
## [1] TRUE
Since the function returns TRUE, we can go ahead with merging the
dataframes. This can be done effectively with the bind_rows() function
from the dplyr package.
# Stack the twelve monthly dataframes into a single year-long dataframe.
monthly_dfs <- list(
  jan_df, feb_df, mar_df, apr_df, may_df, jun_df,
  jul_df, aug_df, sep_df, oct_df, nov_df, dec_df
)
year_2022 <- bind_rows(monthly_dfs)
Let’s check out the resulting dataframe:
# Preview the combined data: first six rows, then the column-level structure.
head(year_2022)
glimpse(year_2022)
## Rows: 5,667,717
## Columns: 13
## $ ride_id <chr> "C2F7DD78E82EC875", "A6CF8980A652D272", "BD0F91DFF7…
## $ rideable_type <chr> "electric_bike", "electric_bike", "classic_bike", "…
## $ started_at <dttm> 2022-01-13 11:59:47, 2022-01-10 08:41:56, 2022-01-…
## $ ended_at <dttm> 2022-01-13 12:02:44, 2022-01-10 08:46:17, 2022-01-…
## $ start_station_name <chr> "Glenwood Ave & Touhy Ave", "Glenwood Ave & Touhy A…
## $ start_station_id <chr> "525", "525", "TA1306000016", "KA1504000151", "TA13…
## $ end_station_name <chr> "Clark St & Touhy Ave", "Clark St & Touhy Ave", "Gr…
## $ end_station_id <chr> "RP-007", "RP-007", "TA1307000001", "TA1309000021",…
## $ start_lat <dbl> 42.01280, 42.01276, 41.92560, 41.98359, 41.87785, 4…
## $ start_lng <dbl> -87.66591, -87.66597, -87.65371, -87.66915, -87.624…
## $ end_lat <dbl> 42.01256, 42.01256, 41.92533, 41.96151, 41.88462, 4…
## $ end_lng <dbl> -87.67437, -87.67437, -87.66580, -87.67139, -87.627…
## $ member_casual <chr> "casual", "casual", "member", "casual", "member", "…
# Frequency table of rider type: count and share of casual vs member trips.
member_count <- tabyl(year_2022, member_casual)
print(member_count)
## member_casual n percent
## casual 2322032 0.4096944
## member 3345685 0.5903056
# Cross-tabulate rider type against bike type.
member_tabyl <- tabyl(year_2022, member_casual, rideable_type)
print(member_tabyl)
## member_casual classic_bike docked_bike electric_bike
## casual 891459 177474 1253099
## member 1709755 0 1635930
The tabyl above shows how casuals can pick between classic bikes, docked bikes, and electric bikes, while members can only pick between classic bikes and electric bikes. Since this seems like a point of interest, I did a bit of research on the internet and found out that near the end of 2020, Divvy introduced the classic bike as a replacement for the docked bikes. Since then, members can only choose either classic or electric bikes, while casual riders can pick among the three. Therefore, I will combine the classic bike and docked bike categories so that I can get a proper comparison of bike preference between casuals and members.
# Divvy retired docked bikes in favor of classic bikes; fold the legacy
# "docked_bike" label into "classic_bike" for a fair member/casual comparison.
docked_rows <- year_2022$rideable_type == 'docked_bike'
year_2022$rideable_type[docked_rows] <- 'classic_bike'
# sort(unique(year_2022$start_station_name))
# Derive the analysis columns in a single mutate(). The original code
# referenced columns as year_2022$started_at inside mutate(); that works here
# but bypasses dplyr's data masking and silently breaks if the pipeline input
# is ever grouped, filtered, or renamed — bare column names are the safe form.
year_2022 <- year_2022 %>%
  mutate(
    # Day name of the trip start (e.g. "Monday"); as.Date() drops the time.
    day_of_ride = weekdays(as.Date(started_at)),
    # Full month name of the trip start (e.g. "January").
    month_of_ride = month.name[month(as.Date(started_at))],
    # Hour of day (0-23) at which the trip started.
    hour_of_ride = hour(started_at),
    # Trip duration in minutes (difftime; negative for bad records).
    ride_length = difftime(ended_at, started_at, units = "mins")
  )
head(year_2022)
# Use skimr for a compact per-column summary (no inline spark charts).
library(skimr)
skim_without_charts(year_2022)
| Name | year_2022 |
| Number of rows | 5667717 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| difftime | 1 |
| numeric | 5 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 5667717 | 0 |
| rideable_type | 0 | 1.00 | 12 | 13 | 0 | 2 | 0 |
| start_station_name | 833064 | 0.85 | 7 | 64 | 0 | 1674 | 0 |
| start_station_id | 833064 | 0.85 | 3 | 44 | 0 | 1313 | 0 |
| end_station_name | 892742 | 0.84 | 9 | 64 | 0 | 1692 | 0 |
| end_station_id | 892742 | 0.84 | 3 | 44 | 0 | 1317 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
| day_of_ride | 0 | 1.00 | 6 | 9 | 0 | 7 | 0 |
| month_of_ride | 0 | 1.00 | 3 | 9 | 0 | 12 | 0 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ride_length | 0 | 1 | -10353.35 mins | 41387.25 mins | 10.28 mins | 22312 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.05 | 41.64 | 41.88 | 41.90 | 41.93 | 45.64 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.84 | -87.66 | -87.64 | -87.63 | -73.80 |
| end_lat | 5858 | 1 | 41.90 | 0.07 | 0.00 | 41.88 | 41.90 | 41.93 | 42.37 |
| end_lng | 5858 | 1 | -87.65 | 0.11 | -88.14 | -87.66 | -87.64 | -87.63 | 0.00 |
| hour_of_ride | 0 | 1 | 14.22 | 5.03 | 0.00 | 11.00 | 15.00 | 18.00 | 23.00 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2022-01-01 00:00:05 | 2022-12-31 23:59:26 | 2022-07-22 15:03:59 | 4745862 |
| ended_at | 0 | 1 | 2022-01-01 00:01:48 | 2023-01-02 04:56:45 | 2022-07-22 15:24:44 | 4758633 |
The number of rows in the data matches the number of unique ride_id, meaning I don’t need to look for duplicate trips.
The data summary above highlights several issues with the data:

* ride_length values that are negative (which should be impossible, since it measures a duration)
* ride_length values that are extremely high (41387.25 mins is almost 29 days)
* missing data in end_lat and end_lng
* missing data in start_station_name, start_station_id, end_station_name, and end_station_id
Several trips are noticeably long, much longer than the average trip length (16.59 mins). Taking a look at the Divvy bike website (where the data originated), the longest bike trips they offer are 3-hour rides for Day Pass holders; each minute beyond the 3 hours incurs an extra charge. Therefore, it is reasonable to assume that rides should be capped at 180 minutes.
# Keep only plausible trips: strictly positive length and at most 180 minutes.
year_2022_filtered <- year_2022 %>%
  filter(ride_length > 0, ride_length <= 180)
Looking through the filtered data, I noticed several trips that are oddly short with the same starting and ending stations. I assume these trips are either false starts or other similar mistakes done by the users. Either way, I will remove these trips from the data set because they are not relevant to our aims.
# Isolate trips that start and end at the same station for inspection.
year_2022_same_stations <- year_2022_filtered %>%
  filter(start_station_name == end_station_name)
skim_without_charts(year_2022_same_stations)
| Name | year_2022_same_stations |
| Number of rows | 290545 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| difftime | 1 |
| numeric | 5 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1 | 16 | 16 | 0 | 290545 | 0 |
| rideable_type | 0 | 1 | 12 | 13 | 0 | 2 | 0 |
| start_station_name | 0 | 1 | 10 | 64 | 0 | 1239 | 0 |
| start_station_id | 0 | 1 | 3 | 44 | 0 | 1114 | 0 |
| end_station_name | 0 | 1 | 10 | 64 | 0 | 1239 | 0 |
| end_station_id | 0 | 1 | 3 | 44 | 0 | 1115 | 0 |
| member_casual | 0 | 1 | 6 | 6 | 0 | 2 | 0 |
| day_of_ride | 0 | 1 | 6 | 9 | 0 | 7 | 0 |
| month_of_ride | 0 | 1 | 3 | 9 | 0 | 12 | 0 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ride_length | 0 | 1 | 0.02 mins | 179.98 mins | 10.5 mins | 9983 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.05 | 41.65 | 41.88 | 41.89 | 41.93 | 45.64 |
| start_lng | 0 | 1 | -87.64 | 0.04 | -87.83 | -87.66 | -87.64 | -87.62 | -73.80 |
| end_lat | 0 | 1 | 41.90 | 0.05 | 41.65 | 41.88 | 41.89 | 41.93 | 42.06 |
| end_lng | 0 | 1 | -87.64 | 0.03 | -87.83 | -87.66 | -87.64 | -87.62 | -87.53 |
| hour_of_ride | 0 | 1 | 14.53 | 4.95 | 0.00 | 12.00 | 15.00 | 18.00 | 23.00 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2022-01-01 00:00:05 | 2022-12-31 23:59:01 | 2022-07-16 15:21:58 | 286656 |
| ended_at | 0 | 1 | 2022-01-01 00:01:48 | 2023-01-01 00:57:51 | 2022-07-16 15:44:32 | 287120 |
# Same-station trips of one minute or less are treated as false starts.
year_2022_same_stations_NA <- filter(year_2022_same_stations, ride_length <= 1)
head(year_2022_same_stations_NA)
# Remove them from the working set. ride_id is unique per trip (verified by
# the skim above), so joining on it alone is sufficient, faster, and avoids
# dplyr's implicit join-by-every-common-column message.
year_2022_filtered <- anti_join(year_2022_filtered, year_2022_same_stations_NA, by = "ride_id")
skim_without_charts(year_2022_filtered)
| Name | year_2022_filtered |
| Number of rows | 5573119 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| difftime | 1 |
| numeric | 5 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 5573119 | 0 |
| rideable_type | 0 | 1.00 | 12 | 13 | 0 | 2 | 0 |
| start_station_name | 831946 | 0.85 | 7 | 64 | 0 | 1672 | 0 |
| start_station_id | 831946 | 0.85 | 3 | 44 | 0 | 1312 | 0 |
| end_station_name | 885450 | 0.84 | 9 | 64 | 0 | 1688 | 0 |
| end_station_id | 885450 | 0.84 | 3 | 44 | 0 | 1316 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
| day_of_ride | 0 | 1.00 | 6 | 9 | 0 | 7 | 0 |
| month_of_ride | 0 | 1.00 | 3 | 9 | 0 | 12 | 0 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ride_length | 0 | 1 | 0.02 mins | 180 mins | 10.38 mins | 10783 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.05 | 41.64 | 41.88 | 41.90 | 41.93 | 45.64 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.84 | -87.66 | -87.64 | -87.63 | -73.80 |
| end_lat | 418 | 1 | 41.90 | 0.07 | 0.00 | 41.88 | 41.90 | 41.93 | 42.19 |
| end_lng | 418 | 1 | -87.65 | 0.11 | -88.14 | -87.66 | -87.64 | -87.63 | 0.00 |
| hour_of_ride | 0 | 1 | 14.22 | 5.03 | 0.00 | 11.00 | 15.00 | 18.00 | 23.00 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2022-01-01 00:00:05 | 2022-12-31 23:59:26 | 2022-07-22 14:39:30 | 4680220 |
| ended_at | 0 | 1 | 2022-01-01 00:01:48 | 2023-01-01 01:51:15 | 2022-07-22 14:58:01 | 4692046 |
# Spot-check: same-station trips remaining after the false-start removal.
year_2022_filtered %>% filter(start_station_name == end_station_name)
Next up is to clean up rows with missing end longitude and latitude values. I will separate them for now to take a closer look at it:
# Pull out trips missing an ending coordinate for a closer look.
df_NA <- year_2022_filtered %>%
  filter(is.na(end_lat) | is.na(end_lng))
skim_without_charts(df_NA)
| Name | df_NA |
| Number of rows | 418 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| difftime | 1 |
| numeric | 5 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1 | 16 | 16 | 0 | 418 | 0 |
| rideable_type | 0 | 1 | 12 | 12 | 0 | 1 | 0 |
| start_station_name | 0 | 1 | 10 | 51 | 0 | 168 | 0 |
| start_station_id | 0 | 1 | 3 | 12 | 0 | 168 | 0 |
| end_station_name | 418 | 0 | NA | NA | 0 | 0 | 0 |
| end_station_id | 418 | 0 | NA | NA | 0 | 0 | 0 |
| member_casual | 0 | 1 | 6 | 6 | 0 | 2 | 0 |
| day_of_ride | 0 | 1 | 6 | 9 | 0 | 7 | 0 |
| month_of_ride | 0 | 1 | 3 | 9 | 0 | 12 | 0 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ride_length | 0 | 1 | 0.58 mins | 179.92 mins | 41.38 mins | 399 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.89 | 0.05 | 41.69 | 41.88 | 41.89 | 41.91 | 42.06 |
| start_lng | 0 | 1 | -87.63 | 0.03 | -87.73 | -87.64 | -87.63 | -87.61 | -87.53 |
| end_lat | 418 | 0 | NaN | NA | NA | NA | NA | NA | NA |
| end_lng | 418 | 0 | NaN | NA | NA | NA | NA | NA | NA |
| hour_of_ride | 0 | 1 | 13.51 | 4.36 | 0.00 | 11.00 | 14.00 | 16.75 | 23.00 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2022-01-07 19:46:52 | 2022-12-21 14:48:59 | 2022-07-23 20:21:38 | 418 |
| ended_at | 0 | 1 | 2022-01-07 21:40:38 | 2022-12-21 17:48:54 | 2022-07-23 20:54:07 | 367 |
All 418 rows with missing end_lat and end_lng values have missing end_station_name and end_station_id, which makes sense. Let’s investigate the table further.
# Compare the mean ride length of coordinate-less trips to the overall mean.
df_NA %>% summarise(mean_length_NA = mean(ride_length))
year_2022_filtered %>% summarise(mean_length_overall = mean(ride_length))
# Break the coordinate-less trips down by rider type and bike type.
NA_tabyl <- tabyl(df_NA, member_casual, rideable_type)
print(NA_tabyl)
## member_casual classic_bike
## casual 411
## member 7
The average ride_length of these trips with missing end values is much higher than the rest. It is also evident that the riders of these trips are mostly casual riders. With this information, I would guess that these rows show trips that ended abnormally outside the bike stations, whether because of traffic accidents, bike malfunctions, etc. This theory is further supported by the fact that the majority of these riders are casuals who are perhaps renting a bike for the first time and do not fully understand how the bike rental system works, and thus were unable to dock the bikes at proper stations.
Either way, there are only 418 trips that are in this category, which only accounts for less than 0.01% of the overall data. So, it can be safely removed without affecting the accuracy of the analysis.
# Drop the 418 trips with missing end coordinates. ride_id uniquely
# identifies a trip, so an explicit single-column join key is faster and
# clearer than dplyr's default join on every common column (and silences
# the implicit-join message).
year_2022_filtered <- anti_join(year_2022_filtered, df_NA, by = "ride_id")
skim_without_charts(year_2022_filtered)
| Name | year_2022_filtered |
| Number of rows | 5572701 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| difftime | 1 |
| numeric | 5 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 5572701 | 0 |
| rideable_type | 0 | 1.00 | 12 | 13 | 0 | 2 | 0 |
| start_station_name | 831946 | 0.85 | 7 | 64 | 0 | 1672 | 0 |
| start_station_id | 831946 | 0.85 | 3 | 44 | 0 | 1312 | 0 |
| end_station_name | 885032 | 0.84 | 9 | 64 | 0 | 1688 | 0 |
| end_station_id | 885032 | 0.84 | 3 | 44 | 0 | 1316 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
| day_of_ride | 0 | 1.00 | 6 | 9 | 0 | 7 | 0 |
| month_of_ride | 0 | 1.00 | 3 | 9 | 0 | 12 | 0 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ride_length | 0 | 1 | 0.02 mins | 180 mins | 10.38 mins | 10783 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.05 | 41.64 | 41.88 | 41.90 | 41.93 | 45.64 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.84 | -87.66 | -87.64 | -87.63 | -73.80 |
| end_lat | 0 | 1 | 41.90 | 0.07 | 0.00 | 41.88 | 41.90 | 41.93 | 42.19 |
| end_lng | 0 | 1 | -87.65 | 0.11 | -88.14 | -87.66 | -87.64 | -87.63 | 0.00 |
| hour_of_ride | 0 | 1 | 14.22 | 5.03 | 0.00 | 11.00 | 15.00 | 18.00 | 23.00 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2022-01-01 00:00:05 | 2022-12-31 23:59:26 | 2022-07-22 14:39:18 | 4679940 |
| ended_at | 0 | 1 | 2022-01-01 00:01:48 | 2023-01-01 01:51:15 | 2022-07-22 14:57:58 | 4691812 |
There are still at least 831946 rows with missing station data. Let’s take a closer look at them.
# Collect every trip missing any of the four station fields.
df_station_NA <- year_2022_filtered %>%
  filter(is.na(start_station_name) | is.na(start_station_id) | is.na(end_station_name) | is.na(end_station_id))
skim_without_charts(df_station_NA)
| Name | df_station_NA |
| Number of rows | 1290215 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| difftime | 1 |
| numeric | 5 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 1290215 | 0 |
| rideable_type | 0 | 1.00 | 12 | 13 | 0 | 2 | 0 |
| start_station_name | 831946 | 0.36 | 7 | 64 | 0 | 1576 | 0 |
| start_station_id | 831946 | 0.36 | 3 | 37 | 0 | 1269 | 0 |
| end_station_name | 885032 | 0.31 | 9 | 64 | 0 | 1572 | 0 |
| end_station_id | 885032 | 0.31 | 3 | 37 | 0 | 1269 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
| day_of_ride | 0 | 1.00 | 6 | 9 | 0 | 7 | 0 |
| month_of_ride | 0 | 1.00 | 3 | 9 | 0 | 12 | 0 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ride_length | 0 | 1 | 0.02 mins | 180 mins | 9.17 mins | 8751 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.06 | 41.64 | 41.88 | 41.91 | 41.94 | 42.07 |
| start_lng | 0 | 1 | -87.66 | 0.04 | -87.84 | -87.68 | -87.65 | -87.63 | -87.52 |
| end_lat | 0 | 1 | 41.90 | 0.06 | 41.59 | 41.88 | 41.91 | 41.94 | 42.19 |
| end_lng | 0 | 1 | -87.66 | 0.04 | -88.14 | -87.68 | -87.65 | -87.63 | -87.50 |
| hour_of_ride | 0 | 1 | 14.27 | 5.22 | 0.00 | 11.00 | 15.00 | 18.00 | 23.00 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2022-01-01 00:05:26 | 2022-12-31 23:55:07 | 2022-07-27 18:35:33 | 1240221 |
| ended_at | 0 | 1 | 2022-01-01 00:16:56 | 2023-01-01 00:28:47 | 2022-07-27 18:48:53 | 1239894 |
# Average trip length among rows with missing station data.
df_station_NA %>% summarize(mean_trip_length = mean(ride_length))
Looking at the data, there are several data with missing station names that have suspiciously short ride length. Before when I was filtering out short trips, I did so because they had the same start and end stations, indicating that they were possibly false starts. Now, with missing start and end station data, I think it is also quite possible that they could be false starts. So, I will also remove trips that are less than 1 minute long, with missing start and end station data.
As for the rest, it turns out that Divvy allows bikes to be rented and returned from outside stations, explaining the missing data. So, I will keep the others.
# Trips under one minute with no station information at either end are
# almost certainly false starts; collect and remove them.
df_station_NA2 <- filter(df_station_NA, ride_length < 1 & is.na(start_station_name) & is.na(start_station_id) & is.na(end_station_name) & is.na(end_station_id))
head(df_station_NA2)
# Join on the unique ride_id instead of dplyr's default join across every
# common column — explicit, faster, and no implicit-join message.
year_2022_filtered <- anti_join(year_2022_filtered, df_station_NA2, by = "ride_id")
skim_without_charts(year_2022_filtered)
| Name | year_2022_filtered |
| Number of rows | 5543908 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| difftime | 1 |
| numeric | 5 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 5543908 | 0 |
| rideable_type | 0 | 1.00 | 12 | 13 | 0 | 2 | 0 |
| start_station_name | 803153 | 0.86 | 7 | 64 | 0 | 1672 | 0 |
| start_station_id | 803153 | 0.86 | 3 | 44 | 0 | 1312 | 0 |
| end_station_name | 856239 | 0.85 | 9 | 64 | 0 | 1688 | 0 |
| end_station_id | 856239 | 0.85 | 3 | 44 | 0 | 1316 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
| day_of_ride | 0 | 1.00 | 6 | 9 | 0 | 7 | 0 |
| month_of_ride | 0 | 1.00 | 3 | 9 | 0 | 12 | 0 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ride_length | 0 | 1 | 0.02 mins | 180 mins | 10.45 mins | 10783 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.05 | 41.64 | 41.88 | 41.90 | 41.93 | 45.64 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.84 | -87.66 | -87.64 | -87.63 | -73.80 |
| end_lat | 0 | 1 | 41.90 | 0.07 | 0.00 | 41.88 | 41.90 | 41.93 | 42.19 |
| end_lng | 0 | 1 | -87.65 | 0.11 | -88.14 | -87.66 | -87.64 | -87.63 | 0.00 |
| hour_of_ride | 0 | 1 | 14.22 | 5.03 | 0.00 | 11.00 | 15.00 | 18.00 | 23.00 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2022-01-01 00:00:05 | 2022-12-31 23:59:26 | 2022-07-22 13:52:16 | 4659020 |
| ended_at | 0 | 1 | 2022-01-01 00:01:48 | 2023-01-01 01:51:15 | 2022-07-22 14:10:26 | 4670845 |
# Order the cleaned trips from shortest to longest for inspection.
sort_trips <- arrange(year_2022_filtered, ride_length)
sort_trips
Applying this concept of false starts to the original dataframe, I found many trips that are also really short and are missing either the start or end station data, strongly implying that they are false starts as well. These rows will be removed.
# Flag sub-minute trips missing a start or end station name as false starts.
sort_trips <- sort_trips %>%
  filter((is.na(start_station_name) | is.na(end_station_name)) & ride_length < 1)
skim_without_charts(sort_trips)
| Name | sort_trips |
| Number of rows | 15420 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| difftime | 1 |
| numeric | 5 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 15420 | 0 |
| rideable_type | 0 | 1.00 | 13 | 13 | 0 | 1 | 0 |
| start_station_name | 2141 | 0.86 | 10 | 53 | 0 | 671 | 0 |
| start_station_id | 2141 | 0.86 | 3 | 36 | 0 | 659 | 0 |
| end_station_name | 13279 | 0.14 | 10 | 51 | 0 | 524 | 0 |
| end_station_id | 13279 | 0.14 | 3 | 12 | 0 | 517 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
| day_of_ride | 0 | 1.00 | 6 | 9 | 0 | 7 | 0 |
| month_of_ride | 0 | 1.00 | 3 | 9 | 0 | 12 | 0 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ride_length | 0 | 1 | 0.02 mins | 0.98 mins | 0.33 mins | 59 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.04 | 41.66 | 41.88 | 41.90 | 41.93 | 42.06 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.83 | -87.66 | -87.64 | -87.63 | -87.53 |
| end_lat | 0 | 1 | 41.90 | 0.04 | 41.66 | 41.88 | 41.90 | 41.93 | 42.06 |
| end_lng | 0 | 1 | -87.65 | 0.03 | -87.83 | -87.66 | -87.64 | -87.63 | -87.53 |
| hour_of_ride | 0 | 1 | 14.14 | 5.11 | 0.00 | 11.00 | 15.00 | 18.00 | 23.00 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2022-01-01 02:36:06 | 2022-12-31 22:17:20 | 2022-08-04 22:47:11 | 15409 |
| ended_at | 0 | 1 | 2022-01-01 02:36:09 | 2022-12-31 22:18:10 | 2022-08-04 22:47:37 | 15410 |
# Drop the false-start trips: anti_join() keeps only rows of
# year_2022_filtered with no match in sort_trips (matched on all
# shared columns, since no `by` is given).
year_2022_filtered <- year_2022_filtered %>%
  anti_join(sort_trips)
## Joining with `by = join_by(ride_id, rideable_type, started_at, ended_at,
## start_station_name, start_station_id, end_station_name, end_station_id,
## start_lat, start_lng, end_lat, end_lng, member_casual, day_of_ride,
## month_of_ride, hour_of_ride, ride_length)`
skim_without_charts(year_2022_filtered)
| Name | year_2022_filtered |
| Number of rows | 5528488 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| difftime | 1 |
| numeric | 5 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 5528488 | 0 |
| rideable_type | 0 | 1.00 | 12 | 13 | 0 | 2 | 0 |
| start_station_name | 801012 | 0.86 | 7 | 64 | 0 | 1671 | 0 |
| start_station_id | 801012 | 0.86 | 3 | 44 | 0 | 1311 | 0 |
| end_station_name | 842960 | 0.85 | 9 | 64 | 0 | 1687 | 0 |
| end_station_id | 842960 | 0.85 | 3 | 44 | 0 | 1316 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
| day_of_ride | 0 | 1.00 | 6 | 9 | 0 | 7 | 0 |
| month_of_ride | 0 | 1.00 | 3 | 9 | 0 | 12 | 0 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ride_length | 0 | 1 | 0.02 mins | 180 mins | 10.48 mins | 10783 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.05 | 41.64 | 41.88 | 41.90 | 41.93 | 45.64 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.84 | -87.66 | -87.64 | -87.63 | -73.80 |
| end_lat | 0 | 1 | 41.90 | 0.07 | 0.00 | 41.88 | 41.90 | 41.93 | 42.19 |
| end_lng | 0 | 1 | -87.65 | 0.11 | -88.14 | -87.66 | -87.64 | -87.63 | 0.00 |
| hour_of_ride | 0 | 1 | 14.22 | 5.03 | 0.00 | 11.00 | 15.00 | 18.00 | 23.00 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2022-01-01 00:00:05 | 2022-12-31 23:59:26 | 2022-07-22 13:11:46 | 4648109 |
| ended_at | 0 | 1 | 2022-01-01 00:01:48 | 2023-01-01 01:51:15 | 2022-07-22 13:29:47 | 4659939 |
# Re-inspect the shortest remaining trips after the first cleaning pass.
sort_trips_2 <- arrange(year_2022_filtered, ride_length)
sort_trips_2
Even after all this data cleaning, there are still trips with nonsensical lengths. Sorting the data by ride_length reveals that some trips last only 1 second, which doesn’t make sense. I then noticed that some of these trips have station IDs like “Hubbard Bike-checking (LBS-WH-TEST)”, which implies an irregular station. Some research on the internet led me to realize that certain stations are used only as maintenance/service stations. These stations are:
Therefore, I will remove them from the overall data.
# Known maintenance/test station names (from inspecting the data and
# researching Divvy's service depots).
service_stations <- c(
  "Base - 2132 W Hubbard Warehouse",
  "Base - 2132 W Hubbard",
  "HUBBARD ST BIKE CHECKING (LBS-WH-TEST)",
  "hubbard_test_lws",
  "WATSON TESTING - DIVVY",
  "WEST CHI-WATSON"
)
# Flag every trip that starts or ends at a service station.
# %in% replaces the long chain of `== ... | == ...` clauses and selects
# exactly the same rows (NA station names never match in either form
# unless the other endpoint matches).
service_trips <- filter(
  year_2022_filtered,
  start_station_name %in% service_stations |
    end_station_name %in% service_stations
)
skim_without_charts(service_trips)
| Name | service_trips |
| Number of rows | 2182 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| difftime | 1 |
| numeric | 5 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 2182 | 0 |
| rideable_type | 0 | 1.00 | 12 | 13 | 0 | 2 | 0 |
| start_station_name | 36 | 0.98 | 10 | 36 | 0 | 54 | 0 |
| start_station_id | 36 | 0.98 | 3 | 35 | 0 | 54 | 0 |
| end_station_name | 770 | 0.65 | 10 | 51 | 0 | 460 | 0 |
| end_station_id | 770 | 0.65 | 3 | 35 | 0 | 452 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
| day_of_ride | 0 | 1.00 | 6 | 9 | 0 | 7 | 0 |
| month_of_ride | 0 | 1.00 | 3 | 9 | 0 | 12 | 0 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ride_length | 0 | 1 | 0.02 mins | 166.47 mins | 10.85 mins | 1234 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.89 | 0.06 | 41.65 | 41.89 | 41.90 | 41.93 | 41.99 |
| start_lng | 0 | 1 | -87.69 | 0.04 | -87.83 | -87.71 | -87.69 | -87.67 | -87.53 |
| end_lat | 0 | 1 | 41.89 | 0.06 | 41.65 | 41.88 | 41.90 | 41.93 | 42.03 |
| end_lng | 0 | 1 | -87.69 | 0.04 | -87.83 | -87.71 | -87.68 | -87.65 | -87.53 |
| hour_of_ride | 0 | 1 | 13.92 | 4.92 | 0.00 | 11.00 | 14.00 | 17.00 | 23.00 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2022-01-03 05:43:17 | 2022-12-19 11:55:32 | 2022-07-12 12:15:39 | 2181 |
| ended_at | 0 | 1 | 2022-01-03 05:58:46 | 2022-12-19 14:42:00 | 2022-07-12 12:18:48 | 2182 |
# Remove every trip that starts or ends at a maintenance/test station.
# With no explicit `by`, anti_join() matches on all shared columns and
# keeps only the rows that have no counterpart in service_trips.
year_2022_filtered <- anti_join(year_2022_filtered, service_trips)
## Joining with `by = join_by(ride_id, rideable_type, started_at, ended_at,
## start_station_name, start_station_id, end_station_name, end_station_id,
## start_lat, start_lng, end_lat, end_lng, member_casual, day_of_ride,
## month_of_ride, hour_of_ride, ride_length)`
# Re-check the summary after removal.
skim_without_charts(year_2022_filtered)
| Name | year_2022_filtered |
| Number of rows | 5526306 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| character | 9 |
| difftime | 1 |
| numeric | 5 |
| POSIXct | 2 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ride_id | 0 | 1.00 | 16 | 16 | 0 | 5526306 | 0 |
| rideable_type | 0 | 1.00 | 12 | 13 | 0 | 2 | 0 |
| start_station_name | 800976 | 0.86 | 7 | 64 | 0 | 1668 | 0 |
| start_station_id | 800976 | 0.86 | 3 | 44 | 0 | 1309 | 0 |
| end_station_name | 842190 | 0.85 | 9 | 64 | 0 | 1684 | 0 |
| end_station_id | 842190 | 0.85 | 3 | 44 | 0 | 1315 | 0 |
| member_casual | 0 | 1.00 | 6 | 6 | 0 | 2 | 0 |
| day_of_ride | 0 | 1.00 | 6 | 9 | 0 | 7 | 0 |
| month_of_ride | 0 | 1.00 | 3 | 9 | 0 | 12 | 0 |
Variable type: difftime
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| ride_length | 0 | 1 | 0.02 mins | 180 mins | 10.48 mins | 10783 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| start_lat | 0 | 1 | 41.90 | 0.05 | 41.64 | 41.88 | 41.90 | 41.93 | 45.64 |
| start_lng | 0 | 1 | -87.65 | 0.03 | -87.84 | -87.66 | -87.64 | -87.63 | -73.80 |
| end_lat | 0 | 1 | 41.90 | 0.07 | 0.00 | 41.88 | 41.90 | 41.93 | 42.19 |
| end_lng | 0 | 1 | -87.65 | 0.11 | -88.14 | -87.66 | -87.64 | -87.63 | 0.00 |
| hour_of_ride | 0 | 1 | 14.22 | 5.03 | 0.00 | 11.00 | 15.00 | 18.00 | 23.00 |
Variable type: POSIXct
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| started_at | 0 | 1 | 2022-01-01 00:00:05 | 2022-12-31 23:59:26 | 2022-07-22 13:16:11 | 4646499 |
| ended_at | 0 | 1 | 2022-01-01 00:01:48 | 2023-01-01 01:51:15 | 2022-07-22 13:34:35 | 4658340 |
# One more look at the shortest trips after removing service-station rows.
sort_trips_3 <- arrange(year_2022_filtered, ride_length)
sort_trips_3
Despite all my cleaning efforts on the data, there are still trips that are seconds long. I will now take a much closer look at them.
# Inspect all trips between this specific pair of (very close) stations.
# Comma-separated conditions in filter() are combined with AND.
sort_trips_3 %>%
  filter(
    start_station_name == "Lincoln Ave & Roscoe St",
    end_station_name == "N Paulina St & Lincoln Ave"
  )
The shortest trip in the data is from the Lincoln Ave & Roscoe St station to the N Paulina St & Lincoln Ave station. I looked these stations up on Google Maps, and it turns out they are only about 12 meters apart. While such a trip is impractical, it is therefore not surprising that trips between these two stations were recorded as 1 second long by the app. Consequently, I can’t rule out trips that are only seconds long. As they won’t meaningfully affect the analysis, I will keep those trips instead.
Therefore, I am now done with the data cleaning process and will be moving on to the Analysis step.
# Optional export of the cleaned data:
# year_2022_filtered %>% write.csv(file = "all_trips_2022_cleaned_v1.csv")

# Share of trips by membership type, expressed as a percentage.
member_tabyl <- tabyl(year_2022_filtered, member_casual)
member_tabyl$percent <- member_tabyl$percent * 100
member_proportion <- ggplot(data = member_tabyl) +
  # geom_col() is the idiomatic shorthand for geom_bar(stat = "identity").
  geom_col(mapping = aes(x = member_casual, y = percent), width = 0.4) +
  labs(title = "Bike Trips by Member Type",
       x = "Member Type",
       y = "Percentage")
member_proportion
The data shows that there are more annual membership holders than casual riders during the entirety of 2022. Members account for 59% of all trips, while casuals account for almost 41%.
# Cross-tabulate membership type against bike type.
member_tabyl_rideable <- year_2022_filtered %>% tabyl(member_casual, rideable_type)
print(member_tabyl_rideable)
## member_casual classic_bike electric_bike
## casual 1039538 1217543
## member 1682841 1586384
# Convert raw counts to within-group percentages so the two membership
# groups are comparable despite different totals.
member_tabyl_rideable$total <-
  member_tabyl_rideable$classic_bike + member_tabyl_rideable$electric_bike
member_tabyl_rideable$classic_percentage <-
  member_tabyl_rideable$classic_bike / member_tabyl_rideable$total * 100
member_tabyl_rideable$electric_percentage <-
  member_tabyl_rideable$electric_bike / member_tabyl_rideable$total * 100
# Long format: one row per (member type, bike type) pair for plotting.
data_long <- pivot_longer(member_tabyl_rideable,
                          cols = c(classic_percentage, electric_percentage),
                          names_to = "bike_type",
                          values_to = "percentage")
ggplot(data_long, aes(x = member_casual, y = percentage, fill = bike_type)) +
  # geom_col() is the idiomatic shorthand for geom_bar(stat = "identity").
  geom_col(position = "dodge") +
  labs(title = "Bike Usage by Member Type",
       x = "Member Type",
       y = "Percentage") +
  scale_fill_discrete(name = "Bike Type", labels = c("Classic Bike", "Electric Bike")) +
  theme_minimal()
The data also shows that casual riders prefer electric bikes over
classic bikes, while the inverse is true for members. This shows that
the company needs to offer better deals for electric bikes to attract
casuals to purchase annual memberships.
# Trips per day of week, split by membership type.
day_tabyl <- tabyl(year_2022_filtered, day_of_ride, member_casual)
day_tabyl
day_tabyl$total <- day_tabyl$casual + day_tabyl$member
# Order the weekdays chronologically instead of alphabetically.
day_order <- c("Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday")
day_tabyl$day_of_ride <- factor(day_tabyl$day_of_ride, levels = day_order)
data_long_3 <- pivot_longer(day_tabyl,
                            cols = c(casual, member),
                            names_to = "membership",
                            values_to = "total_trips")
day_plot <- ggplot(data = data_long_3,
                   mapping = aes(x = day_of_ride, y = total_trips, fill = membership)) +
  # geom_col() is the idiomatic shorthand for geom_bar(stat = "identity").
  geom_col(position = "dodge") +
  labs(title = "Bike Usage by Day per Membership Type",
       x = "Day of Week",
       # Axis labels below are rescaled by 1e-2, so state the unit.
       y = "Total Trips (hundreds)") +
  scale_fill_discrete(name = "Membership Type", labels = c("Casual", "Member")) +
  theme_minimal()
day_plot <- day_plot +
  scale_y_continuous(labels = scales::number_format(scale = 1e-2))
day_plot
The graph shows that the days with the most casual riders are the
weekends, while the total number of member riders increases during the
middle of the week. This implies that most casual riders are likely
people who don’t ride bikes to commute, meaning they only need bike
rides on weekends when they have free time to travel. On the other
hand, members are most likely people who need bikes to commute to
work or school, which results in the high mid-week numbers.
# Trips per hour of day, split by membership type.
hour_tabyl <- tabyl(year_2022_filtered, hour_of_ride, member_casual)
hour_tabyl
data_long_4 <- pivot_longer(hour_tabyl,
                            cols = c(casual, member),
                            names_to = "membership",
                            values_to = "total_trips")
hour_plot <- ggplot(data = data_long_4,
                    mapping = aes(x = hour_of_ride, y = total_trips, fill = membership)) +
  # geom_col() is the idiomatic shorthand for geom_bar(stat = "identity").
  geom_col(position = "dodge") +
  labs(title = "Bike Usage by Hour of Day per Membership Type",
       x = "Hour of Day",
       # Axis labels below are rescaled by 1e-2, so state the unit.
       y = "Total Trips (hundreds)") +
  scale_fill_discrete(name = "Membership Type", labels = c("Casual", "Member")) +
  theme_minimal()
hour_plot <- hour_plot +
  scale_y_continuous(labels = scales::number_format(scale = 1e-2))
hour_plot <- hour_plot +
  # One tick per observed hour (0-23).
  scale_x_continuous(breaks = unique(hour_tabyl$hour_of_ride))
hour_plot
As the day advances, the number of people renting bikes increases,
peaking at around 5PM, until it then decreases as night falls and people
return home to rest. From the graph, it is apparent that the biggest
disparity between casual riders and member riders occur at 7-8AM and
5PM. To increase the number of memberships, it could be worthwhile to
create some sort of incentive program, where if membership holders ride
on certain ‘happy hours’ for some distance, they can gain rewards or
coupons. This will then give a reason for casual riders to consider
getting a membership.
# Trips per month, split by membership type.
month_tabyl <- tabyl(year_2022_filtered, month_of_ride, member_casual)
month_tabyl
# Order the months chronologically instead of alphabetically.
month_order <- c("January",
                 "February",
                 "March",
                 "April",
                 "May",
                 "June",
                 "July",
                 "August",
                 "September",
                 "October",
                 "November",
                 "December")
month_tabyl$month_of_ride <- factor(month_tabyl$month_of_ride, levels = month_order)
data_long_5 <- pivot_longer(month_tabyl,
                            cols = c(casual, member),
                            names_to = "membership",
                            values_to = "total_trips")
month_plot <- ggplot(data = data_long_5,
                     mapping = aes(x = month_of_ride, y = total_trips, fill = membership)) +
  # geom_col() is the idiomatic shorthand for geom_bar(stat = "identity").
  geom_col(position = "dodge") +
  labs(title = "Bike Usage by Month per Membership Type",
       # Fixed copy-paste error: the x axis is the month, not the hour.
       x = "Month",
       # Axis labels below are rescaled by 1e-2, so state the unit.
       y = "Total Trips (hundreds)") +
  scale_fill_discrete(name = "Membership Type", labels = c("Casual", "Member")) +
  theme_minimal()
month_plot <- month_plot +
  scale_y_continuous(labels = scales::number_format(scale = 1e-2))
month_plot <- month_plot +
  # Angle the month names so they don't overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
month_plot
The number of riders seems to peak during the middle of the year, when
the weather is not too cold. This is evident from the numbers decreasing
during the winter and increasing again as the weather warms toward
summer.
The graph shows the greatest disparities between member and casual riders occur during the months September to December. Therefore, to convince these casual riders to buy memberships, I would suggest offering discounted membership prices during these months.
TODO: - Average times - Stations
# Trips per start station, split by membership type.
start_station_tabyl <- tabyl(year_2022_filtered, start_station_name, member_casual)
# Positive difference = more member trips than casual trips at the station.
start_station_tabyl$difference <- start_station_tabyl$member - start_station_tabyl$casual
# Top stations per group; take 11 so one NA "station" can be dropped.
popular_stations_casual <- head(arrange(start_station_tabyl, desc(casual)), n = 11)
popular_stations_casual <- filter(popular_stations_casual, !is.na(start_station_name))
popular_stations_member <- head(arrange(start_station_tabyl, desc(member)), n = 11)
popular_stations_member <- filter(popular_stations_member, !is.na(start_station_name))
# BUG FIX: `n = 10` was previously passed to arrange() instead of head(),
# so head() silently fell back to its default of 6 rows.
biggest_difference_membership <- head(arrange(start_station_tabyl, desc(difference)), n = 10)
biggest_difference_membership <- filter(biggest_difference_membership, !is.na(start_station_name))
biggest_difference_membership
stations_plot <- ggplot(data = biggest_difference_membership,
                        mapping = aes(x = start_station_name, y = difference)) +
  # geom_col() is the idiomatic shorthand for geom_bar(stat = "identity").
  geom_col(position = "dodge") +
  # Title and y label fixed: this plot ranks stations by the member-casual
  # trip-count gap, not by popularity among casual riders.
  labs(title = "Stations with the Biggest Member-Casual Disparity",
       x = "Station Name",
       y = "Trip Difference (hundreds)") +
  theme_minimal()
stations_plot <- stations_plot +
  scale_y_continuous(labels = scales::number_format(scale = 1e-2))
stations_plot <- stations_plot +
  # Shrink station names so they fit on the axis.
  theme(axis.text.x = element_text(size = 8))
stations_plot
The above graph shows the stations with the biggest disparity between members and casuals — that is, the stations with the largest gap between trips taken by riders with memberships and trips taken by riders without. Therefore, I suggest focusing marketing efforts on these areas.
I tried to look for these areas on Google Maps, and it would appear that Ellis Ave & 60th St and University Ave & 57th St are stations that are very close to the University of Chicago and Washington Park, two areas with common need for public transport, so the marketing should focus towards attracting casual riders with this fact.
In conclusion, I found out several interesting insights regarding casual riders and member riders, such as:
To raise the number of membership subscriptions, based on the insights I gained above, I propose the following solutions: